HW 02

Author

Lucas Smith

install.packages("fs",repos = "http://cran.us.r-project.org")

The downloaded binary packages are in
    /var/folders/_d/hvsdqqnd3jddpd6y2p2zdpt40000gn/T//RtmpzcCip1/downloaded_packages
library("ggridges")
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(scales)

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

1 - A new day, a new plot, a new geom

bnb <- dsbox::edibnb

bnb <- bnb |> group_by(neighbourhood) |> summarise(median(review_scores_rating, na.rm=TRUE))
bnb <- dsbox::edibnb
bnb <- bnb |> mutate(neighbourhood=fct_reorder(neighbourhood, review_scores_rating, .fun='median'))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `neighbourhood = fct_reorder(neighbourhood,
  review_scores_rating, .fun = "median")`.
Caused by warning:
! `fct_reorder()` removing 2177 missing values.
ℹ Use `.na_rm = TRUE` to silence this message.
ℹ Use `.na_rm = FALSE` to preserve NAs.
ggplot(bnb, aes(x=review_scores_rating, y=neighbourhood)) + 
  geom_density_ridges() + 
  labs(
    title="Average AirBnB Ratings across Neighbourhoods",
    y="Neighbourhood",
    x="Review Rating"
  )
Picking joint bandwidth of 1.21
Warning: Removed 2177 rows containing non-finite outside the scale range
(`stat_density_ridges()`).

Interpretation:

  • Interesting to see that most values are near 100.

  • There are also blips just around the 80 marks, and this is consistent across every neighbourhood.

  • I can’t tell from the graph itself, but it could be that number of observations are so small as the review rating decreases that it shows a continuing line. The other possibility is that those values are all nonexistent, but I do not think that would be an accurate representation on the graph. “Zoomed out” at this scale, it is hard to decipher the meaning.

2 - Foreign Connected PACs

library(fs)
# get a list of files with "Foreign Connected PAC" in their names
list_of_files <- dir_ls(path = "data", regexp = "Foreign Connected PAC")

# read all files and row bind them
# keeping track of the file name in a new column called year
pac <- read_csv(list_of_files, id = "year")
Rows: 2394 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): PAC Name (Affiliate), Country of Origin/Parent Company, Total, Dems...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# read all files and row bind them
# keeping track of the file name in a new column called year
pac <- read_csv(list_of_files, id = "year")
Rows: 2394 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): PAC Name (Affiliate), Country of Origin/Parent Company, Total, Dems...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pac <- clean_names(pac)
pac <- pac |>
  extract(
    year,
    into = "year",
    regex = "-([0-9]{4})"
  )  |> mutate(year=as.integer(year))
pac <- pac |> extract(dems, into="dems", regex="([0-9]+)") |> mutate(dems=as.double(dems))
pac <- pac |> extract(repubs, into="repubs", regex="([0-9]+)") |> mutate(repubs=as.double(repubs))
pac <- pac |> extract(country_of_origin_parent_company, c("Country", "Parent Company"), "([\\w ]+)/([\\w\\D ]+)") 
pac2 <- pac |> pivot_longer(cols=c("dems", "repubs"), names_to="party", values_to="amount")
pac2['party'] = case_when(pac2['party']=='dems' ~ 'Democrat', pac2['party']=='repubs' ~ 'Republican', 
                          .default='Not Listed')
uk <- pac2 |> 
    filter(Country == "UK") |> 
    group_by(year, party) |> 
    summarise(total=sum(amount))
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
ggplot(uk, aes(x=year, y=total, color=party)) + 
  geom_line(linewidth=1) +
  # make sacle 1M, 2M, 3M
  scale_y_continuous(labels = label_currency(scale_cut=cut_short_scale())) + 
  theme(
    panel.background = element_rect(fill="white"),
    panel.grid.major.x = element_line(color="lightgrey"),
    panel.grid.major.y = element_line(color="lightgrey"),
    panel.grid.minor.x = element_line(color="lightgrey"),
    panel.grid.minor.y = element_line(color="lightgrey"),
    legend.position=c(.9, .15),
    legend.background = element_blank(),
    plot.margin = margin(1, 1, 5, 1)
    ) + 
  scale_color_manual(labels = c("Democrat", "Republican"), values = c("blue", "red")) + 
  labs(
    color = "Party", x="", y="", title="Contributions to US political parties from UK-connected PACs"
  ) + 
  # allow for text outside of plotting area
  coord_cartesian(clip="off") + 
  annotate("text", label="Source: OpenSecrets.org", x=2020.7, y=1e6, vjust=7, size=3) + 
  annotate("text", label="Total Amount", x=2000, y=1e6, vjust=-6, size=4, hjust=.3, angle=90) + 
  annotate("text", label="Year", x=2000, y=1e6, vjust=5, size=4, hjust=1)
Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
3.5.0.
ℹ Please use the `legend.position.inside` argument of `theme()` instead.

france <- pac2 |> 
    filter(Country == "France") |> 
    group_by(year, party) |> 
    summarise(total=sum(amount))
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
ggplot(france, aes(x=year, y=total, color=party)) + 
  geom_line(linewidth=1) +
  # make sacle 1M, 2M, 3M
  scale_y_continuous(labels = label_currency(scale_cut=cut_short_scale())) + 
  theme(
    panel.background = element_rect(fill="white"),
    panel.grid.major.x = element_line(color="lightgrey"),
    panel.grid.major.y = element_line(color="lightgrey"),
    panel.grid.minor.x = element_line(color="lightgrey"),
    panel.grid.minor.y = element_line(color="lightgrey"),
    legend.position=c(.9, .15),
    legend.background = element_blank(),
    plot.margin = margin(1, 1, 5, 1)
    ) + 
  scale_color_manual(labels = c("Democrat", "Republican"), values = c("blue", "red")) + 
  labs(
    color = "Party", x="", y="", title="Contributions to US political parties from French-connected PACs"
  ) + 
  # allow for text outside of plotting area
  coord_cartesian(clip="off") + 
  annotate("text", label="Source: OpenSecrets.org", x=2020.3, y=200000, vjust=6.5, size=3) + 
  annotate("text", label="Total Amount", x=2000, y=200000, vjust=-7, size=4, hjust=.4, angle=90) + 
  annotate("text", label="Year", x=2000, y=200000, vjust=5, size=4, hjust=1)

It’s interesting to find that there is a similar trend between the UK and French political contributions. We see an increase around 2008 for democrats, and a decrease for republicans around 2018.

3 - Median housing prices in the US

medhousing <- read_csv("data/median-housing.csv")
Rows: 234 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl  (1): MSPUS
date (1): DATE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
recessions <- read_csv("data/recessions.csv")
Rows: 34 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
date (2): Peak, Trough

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
medhousing <- rename(medhousing, all_of(c(date="DATE", price="MSPUS")))
library(scales)
ggplot(medhousing, aes(x=date, y=price)) +
  geom_line(color="#5571DC", linewidth=1) + 
  scale_y_continuous(limits=c(0,400000), labels=label_comma(), n.breaks=13) + 

  labs(
    x="", y="Price", title="Median sales price of houses sold in the United States",
    subtitle="Not seasonally adjusted"
  ) + 
  scale_x_date(date_breaks="5 years", date_labels="%Y") + 
  # annotation_custom(textGrob("Sources: Census; HUD"))
   annotate("text", label="Sources: Census; HUD", x=ymd("2015-12-01"), y=0, vjust=5) + 
  coord_cartesian(clip = "off") + 
    theme(
    panel.grid.minor.x=element_blank(),
    panel.grid.minor.y=element_blank(),
    panel.grid.major.x=element_blank(),
    panel.background = element_rect(fill = "white"),
    panel.grid.major.y=element_line(color="lightgrey"),
    axis.ticks=element_blank(),
    plot.margin = unit(c(1,1,2,1), "lines")
  ) 

library(data.table)

Attaching package: 'data.table'
The following objects are masked from 'package:lubridate':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year
The following objects are masked from 'package:dplyr':

    between, first, last
The following object is masked from 'package:purrr':

    transpose
# get min and max housing date
min_housing_date = min(medhousing$date)
max_housing_date = max(medhousing$date)
# i realized that the next 5 lines aren't needed.
recessions['occurredDuringHousing'] = ifelse(recessions$Peak >= min_housing_date & recessions$Trough <= max_housing_date, TRUE, FALSE)


medhousing['prev_year_price'] =  shift(medhousing$price, 1, type='lag')
medhousing['decline'] = ifelse(medhousing$price < medhousing$prev_year_price, TRUE, FALSE)
medhousing['nextdate'] = shift(medhousing$date, -1)


# create base plot 
baseplot <- ggplot(medhousing, aes(x=date, y=price)) +
  scale_y_continuous(limits=c(0,400000), labels=label_comma(), n.breaks=13) + 
  labs(
    x="", y="Price", title="Median sales price of houses sold in the United States",
    subtitle="Not seasonally adjusted"
  ) + 
  scale_x_date(date_breaks="5 years", date_labels="%Y") + 
   annotate("text", label="Shaded areas indicate U.S. recessions", x=ymd("2015-12-01"), y=0, vjust=5, hjust=.7) + 
  annotate("text", label="Sources: Census; HUD", x=ymd("2015-12-01"), y=0, vjust=7) + 
  coord_cartesian(clip = "off") + 
    theme(
    panel.grid.minor.x=element_blank(),
    panel.grid.minor.y=element_blank(),
    panel.grid.major.x=element_blank(),
    panel.background = element_rect(fill = "white"),
    panel.grid.major.y=element_line(color="lightgrey"),
    axis.ticks=element_blank(),
    plot.margin = unit(c(1,1,2,1), "lines")
  ) 

# identify recessions
identifiedRecessions <- recessions |> filter(recessions$occurredDuringHousing == TRUE)

for (i in 1:nrow(identifiedRecessions)) {
  row <- identifiedRecessions[i,]
  baseplot <- baseplot + annotate("rect", xmin=row$Peak, xmax=row$Trough, ymin=0, ymax=40e4, fill="#E6E6E6", alpha=1)
}

baseplot + geom_line(color="#5571DC",linewidth=1)

install.packages("zoo",repos = "http://cran.us.r-project.org")

The downloaded binary packages are in
    /var/folders/_d/hvsdqqnd3jddpd6y2p2zdpt40000gn/T//RtmpzcCip1/downloaded_packages
library(zoo)

Attaching package: 'zoo'
The following objects are masked from 'package:data.table':

    yearmon, yearqtr
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
medhousingSubset <- medhousing |> filter(date >= ymd("2019-01-01") & date <= ymd("2020-12-31"))
medhousingSubset <- medhousingSubset |> mutate(year = year(date)) |> mutate(quarter = case_when(
  month(date) %in% c(1, 2, 3) ~ "Q1",
  month(date) %in% c(4, 5, 6) ~ "Q2",
  month(date) %in% c(7, 8, 9) ~ "Q3",
  month(date) %in% c(10, 11, 12) ~ "Q4",
  .default = ""
  
)) |> mutate(posixdate = as.POSIXct(date))

ggplot(medhousingSubset, aes(y=price, x=date)) + 
  geom_line(color="#5571DC", linewidth=1) + geom_point(color='#5571DC', fill='white', shape=21) + 
  theme(
    panel.background = element_rect(fill="white"),
    panel.grid.major = element_line(color="#EBEBEB"),
    panel.grid.minor.y=element_line(color="#EBEBEB"),
    strip.clip="on",
    axis.ticks = element_blank(),
    plot.margin = unit(c(1, 1, 2, 1), "cm")
    
  )  + xlim(ymd("2019-01-01", ymd("2020-12-31"))) + 
  scale_y_continuous(limits=c(300000, 360000),labels=label_comma(), breaks=seq(30e4,36e4, by = 2e4), expand=c(0,0)) + 
    labs(
    x="", y="Dollars", title="Median sales price of houses sold in the United States",
    subtitle="Not seasonally adjusted"
  ) +
  coord_cartesian(clip = "off") + 
  annotate("text", label="2020", x=ymd("2020-10-01"), y=30e4, vjust=5, hjust=4.) + 
  annotate("text", label="2019", x=ymd("2020-10-01"), y=30e4, vjust=5, hjust=13.5) + 
  scale_x_date(date_breaks = "3 months", labels=quarter, breaks=c(1, 4, 7, 12), expand=c(0,0)) 
Scale for x is already present.
Adding another scale for x, which will replace the existing scale.

4 - Expect More. Plot More.

install.packages("ggforce",repos = "http://cran.us.r-project.org")

The downloaded binary packages are in
    /var/folders/_d/hvsdqqnd3jddpd6y2p2zdpt40000gn/T//RtmpzcCip1/downloaded_packages
library("ggforce")

For sourcing the colors, I did not look this up online. I instead took a screenshot of your photo and did a color picker for the hexadecimal color. You can do so built in from Mac. I verified it from this: https://www.brandcolorcode.com/target

For this process, I made three circles filled in alternating with red, white, red, and layered them with smaller ones in the front.

library(knitr)
library("ggplot2")
library("ggforce")
knitr::opts_chunk$set(
  fig.width = 7,        # 7" width
  fig.asp = .4,      # the golden ratio
  fig.retina = 3,       # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300             # higher dpi, sharper image
)

# creating a data frame
df <- data.frame(col1=sample(rep(c(1, 20, 40), each=26)),
                 col2=sample(rep(c(1: 6), each=13))
                 )

df <- data.frame(col1=c(0, 100), col2=c(0, 100))

# plotting the data
ggplot(df, aes(x=col1, y=col2)) +
  geom_circle(aes(x0=50, y0=50, r=50), fill="#CC0000", color="#CC0000", inherit.aes=FALSE)+
  geom_circle(aes(x0=50, y0=50, r=35), fill="white", color="white", inherit.aes=FALSE)+
  geom_circle(aes(x0=50, y0=50, r=15), fill="#CC0000", color="#CC0000", inherit.aes=FALSE)+

 coord_fixed(clip="off") + 
  theme (
    panel.background = element_blank(),
    axis.title = element_blank(),
    legend.position="none",
    axis.ticks.x = element_blank(),
    axis.ticks.y=element_blank(),
    axis.text.x=element_blank(),
    axis.text.y=element_blank(),
    # plot.margin=margin(10, 10, 10, 10)
  ) + 
  annotate("text", label="TARGET", x=50, y=0, vjust=2, color="#CC0000", size=12, fontface="bold") +
    geom_circle(aes(x0=88, y0=-23, r=4.5), fill="white", color="#CC0000", inherit.aes=FALSE, linewidth=1) + 
    annotate("text", label="R", x=88, y=-23, color="#CC0000", size=4, fontface="bold")
Warning in geom_circle(aes(x0 = 50, y0 = 50, r = 50), fill = "#CC0000", : All aesthetics have length 1, but the data has 2 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.
Warning in geom_circle(aes(x0 = 50, y0 = 50, r = 35), fill = "white", color = "white", : All aesthetics have length 1, but the data has 2 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.
Warning in geom_circle(aes(x0 = 50, y0 = 50, r = 15), fill = "#CC0000", : All aesthetics have length 1, but the data has 2 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.
Warning in geom_circle(aes(x0 = 88, y0 = -23, r = 4.5), fill = "white", : All aesthetics have length 1, but the data has 2 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.

5 - Mirror, mirror on the wall, who’s the ugliest of them all?

install.packages("palmerpenguins",repos = "http://cran.us.r-project.org")

The downloaded binary packages are in
    /var/folders/_d/hvsdqqnd3jddpd6y2p2zdpt40000gn/T//RtmpzcCip1/downloaded_packages
library(palmerpenguins)

Attaching package: 'palmerpenguins'
The following objects are masked from 'package:datasets':

    penguins, penguins_raw
penguins
# A tibble: 344 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           NA            NA                  NA          NA
 5 Adelie  Torgersen           36.7          19.3               193        3450
 6 Adelie  Torgersen           39.3          20.6               190        3650
 7 Adelie  Torgersen           38.9          17.8               181        3625
 8 Adelie  Torgersen           39.2          19.6               195        4675
 9 Adelie  Torgersen           34.1          18.1               193        3475
10 Adelie  Torgersen           42            20.2               190        4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
ggplot(penguins, aes(x=bill_depth_mm, y=bill_length_mm, color=flipper_length_mm)) + 
  geom_point()
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(penguins, aes(x=bill_depth_mm, y=bill_length_mm, color=flipper_length_mm)) + 
  geom_point(shape="\u003C", size=4) + 
  scale_color_gradient(low="pink", high="orange") + 
  scale_x_continuous(limits=c(0, 40)) + 
  theme(
    panel.background = element_rect(fill="red"),
    panel.grid.major.x = element_line(color="green"),
      panel.grid.major.y = element_line(color="blue", linewidth = 2), 
    legend.background = element_rect()
  )
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).